from zipfile import ZipFile
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import matplotlib.pyplot as plt

zipf = "./lim2018_files.zip"
# To get this data, please check the reference:
# Lim, C. (2018). Checking how fact-checkers check. Research & Politics, 5(3), 2053168018786848.

# Read six CSV members of the archive into DataFrames.
# Members are selected by their position in the archive listing, so the
# name-to-index mapping below assumes the archive keeps that member order
# -- TODO confirm against the printed listing of the downloaded zip.
# The handle is named `archive` (not `zip`) so the builtin `zip` is never
# shadowed and no trailing `del` is needed to restore it.
with ZipFile(zipf, 'r') as archive:
    archive.printdir()  # print the member table to stdout
    listf = archive.namelist()
    finaldtm = pd.read_csv(archive.open(listf[0]))
    fc = pd.read_csv(archive.open(listf[1]))
    murky = pd.read_csv(archive.open(listf[2]))
    overlap = pd.read_csv(archive.open(listf[3]))
    pf = pd.read_csv(archive.open(listf[4]))
    overlapdtm = pd.read_csv(archive.open(listf[5]))

# Pool every statement into one Series with a fresh 0..n-1 index:
# all pf statements first, followed by all fc statements.
statements = pd.concat([pf.statement, fc.statement], ignore_index=True)
list(set(fc.category))  # distinct category labels; only displayed in an interactive session


# Derive two 0/1 indicator columns from `category` on both frames:
#   overlap   -> 1 only when category == 'overlap'
#   overmurky -> 1 when category is 'overlap' or 'murky'
# Every other category (including missing values) maps to 0 in both columns.
#
# Whole-column assignment is used instead of element-wise
# `frame[col][i] = value` writes. Those are chained assignments: pandas emits
# a warning for them and, with copy-on-write enabled, they do not modify the
# frame at all. The columns now hold integers rather than Python objects.
for frame in (fc, pf):
    category = frame['category']
    frame['overlap'] = (category == 'overlap').astype(int)
    frame['overmurky'] = category.isin(['overlap', 'murky']).astype(int)

'''Word Embeddings'''
# Three representations of the pooled statements are built here; each ends up
# as a dense 2-D array (arr1, arr2, arr3) with one row per statement.

# Term counts: text is lowercased and English stop words are dropped.
from sklearn.feature_extraction.text import CountVectorizer
ohe = CountVectorizer(lowercase=True, stop_words='english')
X1 = ohe.fit_transform(statements)
arr1 = X1.toarray()  # densify the sparse count matrix
ohe.get_feature_names_out()  # vocabulary; only displayed in an interactive session

# TF-IDF
# Default TfidfVectorizer settings, fitted on the same statements.
from sklearn.feature_extraction.text import TfidfVectorizer
ti = TfidfVectorizer()
X2 = ti.fit_transform(statements)
arr2 = X2.toarray()

# SentenceBert
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('bert-base-nli-mean-tokens')
# encode() is documented for a string or a list of strings. Passing the
# Series itself only works while its index happens to be 0..n-1, so a plain
# list is handed over instead; row order is unchanged.
arr3 = model.encode(statements.tolist())
arr3.shape  # only displayed in an interactive session

''' Evaluation'''
# Ground-truth labels as plain Python booleans, one per row of each frame,
# taken from the `overlap` indicator column.
pf_true = list(map(bool, pf['overlap'].values))
fc_true = list(map(bool, fc['overlap'].values))
# Grid of similarity thresholds: start 0, stop 1, step 0.05.
xs = np.arange(0, 1, 0.05)

def evaluation(arr, xs, n_pf=None):
    """Score a statement representation by thresholding cross-group similarity.

    The rows of ``arr`` are split into a leading pf part and a trailing fc
    part, and cosine similarity is computed for every pf/fc pair. For each
    threshold in ``xs`` a statement is predicted positive when at least one of
    its similarities to the other group is strictly greater than the
    threshold. Predictions are scored against the module-level ``pf_true`` and
    ``fc_true`` label lists.

    Args:
        arr: 2-D feature matrix with one row per statement, pf rows first.
        xs: Iterable of similarity thresholds.
        n_pf: Number of leading rows of ``arr`` that belong to pf. Defaults to
            ``len(pf_true)``, the only split for which the predictions and
            the labels have matching lengths.

    Returns:
        A tuple ``(sim, pf_f1, fc_f1)``: the pf-by-fc cosine similarity
        matrix, and two lists holding the positive-class F1 score for pf and
        for fc at each threshold, in the order of ``xs``.
    """
    # Imported locally: the module-level sklearn.metrics import appears further
    # down the file than the first call to this function.
    from sklearn.metrics import precision_recall_fscore_support

    split = len(pf_true) if n_pf is None else n_pf
    pfarr = arr[:split]
    fcarr = arr[split:]
    sim = cosine_similarity(pfarr, fcarr)
    pf_f1 = []
    fc_f1 = []
    for x in xs:
        # One boolean matrix per threshold; reducing along each axis gives
        # "any similarity above x" per pf row and per fc column.
        above = sim > x
        pf_y = above.any(axis=1)
        fc_y = above.any(axis=0)
        pf_e = precision_recall_fscore_support(pf_true, pf_y, average='binary')
        fc_e = precision_recall_fscore_support(fc_true, fc_y, average='binary')
        # Index 2 of the returned tuple is the F-score.
        pf_f1.append(pf_e[2])
        fc_f1.append(fc_e[2])
    return sim, pf_f1, fc_f1

def img_f1dist(li1,li2,li3):
    """Draw three F1-versus-threshold curves on one figure and display it.

    The x values are taken from the module-level ``xs``. ``li1``, ``li2`` and
    ``li3`` are plotted in that order and labelled "Count Vectorizer",
    "TF-IDF Vectorizer" and "Sentence BERT" respectively.
    """
    import matplotlib.pyplot as plt

    labelled_curves = (
        (li1, "Count Vectorizer"),
        (li2, "TF-IDF Vectorizer"),
        (li3, "Sentence BERT"),
    )
    for scores, caption in labelled_curves:
        plt.plot(xs, scores, label=caption)

    # Fixed y-range and 0.1-spaced x ticks.
    plt.ylim([0.1, 0.8])
    plt.xlabel("Similarity score")
    plt.ylabel("F1 score for the positive class")
    tick_positions = np.arange(0, 1.1, 0.1)
    plt.xticks(tick_positions)
    plt.legend()
    plt.grid(which='major', axis='both')
    plt.show()

# Score each representation over the same threshold grid.
ohe_sim, pf_f11, fc_f11 = evaluation(arr1, xs)
tfidf_sim, pf_f12, fc_f12 = evaluation(arr2, xs)
bert_sim, pf_f13, fc_f13 = evaluation(arr3, xs)

# Plot the pf F1 curves of the three representations together.
img_f1dist(pf_f11, pf_f12, pf_f13)

# Find the max f1-score for each case
# (TF-IDF curves). The values are printed explicitly: as bare expressions they
# would be computed and discarded when the file runs as a script.
for tag, f1_curve in (("pf", pf_f12), ("fc", fc_f12)):
    best = np.argmax(f1_curve)  # position of the first maximum in the curve
    print(tag, "max F1:", round(max(f1_curve), 4),
          "index:", best, "threshold:", xs[best])

## Confusion Matrix for Politifact max point
x = 0.50
# A pf statement is predicted positive when any TF-IDF similarity in its row
# is strictly greater than x.
pf_y = (tfidf_sim > x).any(axis=1).tolist()

### Visualization using scikitplot
import scikitplot as skplt
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    precision_recall_fscore_support,
    accuracy_score,
)
skplt.metrics.plot_confusion_matrix(pf_true, pf_y)
plt.show()

print(classification_report(pf_true, pf_y))

# Overall accuracy plus support-weighted precision / recall / F-score.
score = precision_recall_fscore_support(pf_true, pf_y, average='weighted')
precision, recall, fscore, k = score
accuracy = accuracy_score(pf_true, pf_y)
for caption, value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F score:", fscore),
):
    print(caption, value)